1. Introduction and Aims

Two datasets were generated using the 10X Genomics Chromium 3’ scRNA-Seq platform:

species experiment_name run_number lane_number sequencer approximate_number_of_cells
pb straight bleed experiment 22252 5 Hiseq 4000 30,000
pb 1:1 mix experiment 24284 1 & 2 Hiseq 2500 5,000

This data has been processed using CellRanger into counts tables. This initial analysis gave the following metrics:

Pb 1:1 mix experiment (run #: 24284 lanes 1 and 2 (Hiseq 2500)):

We will load this data in and for each run:

A. Define ‘cells’

B. Filter poor quality cells out

C. Dimensionality Reduction and Clustering

D. Remove Doublets

E. Predict life Cycle Stage (Using Bulk RNA-Seq Correlation)

2. Read in the data

Load the required packages

[1] "Seurat is loaded correctly"
[1] "cowplot is loaded correctly"
[1] "gridExtra is loaded correctly"
[1] "grid is loaded correctly"
[1] "Hmisc is loaded correctly"
[1] "dplyr is loaded correctly"
[1] "scales is loaded correctly"
[1] "ggpubr is loaded correctly"
[1] "patchwork is loaded correctly"

Import GTF file

This will be helpful later on. This contains annotations for each gene:

##Import gtf file:
gtf <- read.table("/Users/Andy/GCSKO/GCSKO_analysis_git/data/Reference/Pberghei.gtf", sep="\t", header = FALSE)
head(gtf)

Read in the Data

## read in 10x output 
tenx5k_raw_data <- Read10X("/Users/Andy/GCSKO/GCSKO_analysis_git/data/10X/tenx_24284")

## Create Seurat object
tenx5k <- CreateSeuratObject(counts = tenx5k_raw_data, min.cells = 0, min.features = 0, project = "GCSKO")
Feature names cannot have underscores ('_'), replacing with dashes ('-')
## add experiment to meta data for merging later
tenx5k@meta.data$experiment <- "tenx5k"

3. Defining Cells vs. Background

Plot a knee plot and then use a mixture model to define where the cells vs. background lie

## interesting reference material for this section can be found here: https://hemberg-lab.github.io/scRNA.seq.course/processing-raw-scrna-seq-data.html 

## get the nUMIs
umi_per_barcode <- as.data.frame(tenx5k@meta.data$nCount_RNA)

## remove zeros as these have issues when you log them and make the model later:
umi_per_barcode <- as.data.frame(umi_per_barcode[!(umi_per_barcode$`tenx5k@meta.data$nCount_RNA`==0), ])

## get a rank for each barcode
barcode_rank <- rank(-umi_per_barcode[,1])

## then make into a list
lib_size <- (umi_per_barcode[,1])

## then log this
log_lib_size <- log10(umi_per_barcode[,1])

##plot
#plot(barcode_rank, log_lib_size, xlim=c(1,100000))

## order the barcode ranks
o <- order(barcode_rank)

## reorder the library size, barcode rank by their rank
log_lib_size <- log_lib_size[o]
barcode_rank <- barcode_rank[o]
lib_size <- lib_size[o]

make a mixture model to determine the knee of the plot

## set a seed for the mixture model
set.seed(-92497)

## mixture model calculation 
require("mixtools")
mix <- normalmixEM(log_lib_size)
One of the variances is going to zero;  trying new starting values.
One of the variances is going to zero;  trying new starting values.
number of iterations= 17 
## plot result
plot(mix, which=2, xlab2="log(mol per cell)")

Find where the distributions intersect (i.e. where cells vs. background is)

## identify where the split between the distributions is
p1 <- dnorm(log_lib_size, mean=mix$mu[1], sd=mix$sigma[1])
p2 <- dnorm(log_lib_size, mean=mix$mu[2], sd=mix$sigma[2])
if (mix$mu[1] < mix$mu[2]) {
    split <- min(log_lib_size[p2 > p1])
} else {
    split <- min(log_lib_size[p1 > p2])
}

## print split
split
[1] 1.724276

View the initial result

## log the barcode rank
log_barcode_rank <- log10(barcode_rank)

## plot
plot(log_barcode_rank, log_lib_size, xlim=c(1,6))
## add the split as a line on the plot
abline(h=split, col="red")

Final Figures:

## make the results of the above functions into a dataframe
df_barcodes <- as.data.frame(cbind(barcode_rank, log_lib_size, lib_size), row.names = NULL)

## add a column for if it is a cell or not
df_barcodes$cell = rownames(df_barcodes) %in% which(df_barcodes$log_lib_size > split)

## change value to a numeric
df_barcodes$cell <- as.numeric(df_barcodes$cell)

## change the 0 to a 2 for ease of handling
df_barcodes$cell[df_barcodes$cell<1] <- 2

## rename the numerics into cells or background
df_barcodes$cell[df_barcodes$cell == 1] <- "Cells"
df_barcodes$cell[df_barcodes$cell == 2] <- "Background"

## extract the cutoff for cells do you can plot the lines
boundary <- as.numeric(sum(df_barcodes$cell == "Cells"))
split <- 10^split

## make the plot
barcode_plot <- ggplot(df_barcodes, aes(x=barcode_rank, y=lib_size, colour = cell, theme_size = 40)) +
  ## make into a dot plot
  geom_point(size = 1, shape = 16) +
  ## make the axis into log and specify breaks
  scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) +
  ## make the axis into log and specify breaks
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) +
  annotation_logticks() +
  ## change colours of plot
  scale_color_manual(values=c("#bdbdbd", "#5ba43a"), labels = c("Background", "Cells")) +
  ## change aes of legend
  theme(legend.position="bottom", text = element_text(size=25), legend.text=element_text(size=25), axis.text=element_text(size=25)) +
  ## add the lines on the plot
  geom_segment(aes(x = 0, y = split, xend = boundary, yend = split), colour = "black", alpha = 0.01) +
  geom_segment(aes(x = boundary, y = 0, xend = boundary, yend = split), colour = "black", alpha = 0.01) +
  ## change the axis labels
  labs(x = "Barcodes", y = "UMI Counts", colour="Cell Designation") +
  ## change the size of the legend
  guides(colour = guide_legend(override.aes = list(size=10))) +
  ## fix axis
  coord_fixed() +
  ## make it look pretty
  theme_light()

## print the plot
barcode_plot

## save the plot
ggsave("barcode_plot_5k.png", plot = barcode_plot, device = "png", height = 15, width = 15, units = "cm", path = "/Users/Andy/GCSKO/GCSKO_analysis_git/images_to_export/")

so the number of cells that is retained is:


Background      Cells 
    138879       7762 

Filter the object

## extract the nCount and row names from the Seurat object
upb <- data.frame(nCount_RNA = tenx5k@meta.data$nCount_RNA, row.names = rownames(tenx5k@meta.data))

## make blank column for rank
upb$rank <- NA

## order by nCounts
order.scores <- order(upb$nCount_RNA, decreasing = TRUE)

## add a rank to this column based on the ordered nCount
upb$rank[order.scores] <- 1:nrow(upb)

## inspect
#head(upb)

## make a list of cells to retain in the Seurat object
keep_cells <- rownames(upb[which(upb$rank < 7763),])

## subset Seurat object to include cells and discard background
pb_sex <- subset(tenx5k, cells = keep_cells)

4. Filter Out Poor-Quality Cells

Filter Mitochondrial %

Mitochondrial cell counts

## extract mitochondrial genes 
#mito_genes <- gtf[which(gtf$V3 == "rRNA"),]$V9
#mito_genes <- gsub(";.*","", gsub("gene_id ", "", mito_genes))
#paste("These are the mitochondrial genes")
#head(mito_genes)

## extract mito genes
mito_genes <- pb_sex@assays$RNA@counts@Dimnames[[1]][grep("^PBANKA-MIT",pb_sex@assays$RNA@counts@Dimnames[[1]])]

## plot the genes individually
VlnPlot(object = pb_sex, features = mito_genes, pt.size = 0.01)
All cells have the same value of PBANKA-MIT0010.All cells have the same value of PBANKA-MIT0020.All cells have the same value of PBANKA-MIT0030.All cells have the same value of PBANKA-MIT0040.All cells have the same value of PBANKA-MIT0050.All cells have the same value of PBANKA-MIT0090.All cells have the same value of PBANKA-MIT0130.All cells have the same value of PBANKA-MIT0140.All cells have the same value of PBANKA-MIT0150.All cells have the same value of PBANKA-MIT0160.All cells have the same value of PBANKA-MIT0200.All cells have the same value of PBANKA-MIT0210.All cells have the same value of PBANKA-MIT0230.All cells have the same value of PBANKA-MIT0240.All cells have the same value of PBANKA-MIT0250.All cells have the same value of PBANKA-MIT0260.All cells have the same value of PBANKA-MIT0290.All cells have the same value of PBANKA-MIT0300.All cells have the same value of PBANKA-MIT0320.All cells have the same value of PBANKA-MIT0330.All cells have the same value of PBANKA-MIT0340.All cells have the same value of PBANKA-MIT0370.

## make a percentage mitocondrial for each cell (this will work as long as you filter cells out with zero counts)
pb_sex <- PercentageFeatureSet(pb_sex, pattern = "^PBANKA-MIT", col.name = "percent.mt")

plot percentage mitochondrial

## plot for percentage of mitochondrial reads
v1 <- VlnPlot(object = pb_sex, features = "percent.mt", pt.size = 0.01) +
  ## add a line where we will filter
  geom_abline(intercept = 20, col="blue") +
  ## change labels
  labs(x = "",y = "% Mitochondrial Reads", title = "Mitochondrial per cell") +
  ## remove legend
  theme(legend.position = "none") +
  ## change appearance
  theme_classic() +
  scale_fill_manual(values="grey") +
  #scale_y_continuous(limits = c(0, 100)) +
  theme(legend.position="none", axis.text.x = element_blank(), axis.ticks.x=element_blank(), text = element_text(size=20), legend.text=element_text(size=20), axis.text=element_text(size=20), axis.text.y=element_text(colour="black"))

## print
v1


## save
#ggsave("~/images_to_export/QC_10X_mito_violin.png", plot = QC_mito_violin, device = "png", path = NULL, scale = 1, width = 15, height = 10, units = "cm", dpi = 300, limitsize = TRUE)

In the Smart-seq2 data, we use a threshold of 20%. No cell in our 10X data is higher than 13% and only

[1] 10

cells have a % mitochondrial reads above 5%.

nGenes filter

plot individual violin plots

## nGenes plot
gene_plot_5k <- VlnPlot(object = pb_sex, features = "nFeature_RNA", pt.size = 0.01)

## improve the aesthetics
gene_plot_5k <- gene_plot_5k + 
  geom_abline(intercept = 200, col="blue") +
  labs(x = "",y = "nGene", title = "Genes per cell") +
  theme_classic() +
  scale_fill_manual(values="grey") +
  scale_y_continuous(limits = c(0, 3000)) +
  theme(legend.position="none", axis.text.x = element_blank(), axis.ticks.x=element_blank(), text = element_text(size=20), legend.text=element_text(size=20), axis.text=element_text(size=20), axis.text.y=element_text(colour="black"))
Scale for 'y' is already present. Adding another scale for 'y', which will replace the
existing scale.
## nUMI plot
numi_plot_5k <- VlnPlot(object = pb_sex, features = "nCount_RNA", pt.size = 0.01)

## improve aesthetics
numi_plot_5k <- numi_plot_5k +
  labs(x = "",y = "nUMI", title = "UMIs per cell") +
  theme_classic() +
  scale_fill_manual(values="grey") +
  scale_y_continuous(limits = c(0, 3000)) +
  theme(legend.position="none", axis.text.x = element_blank(), axis.ticks.x=element_blank(), text = element_text(size=20), legend.text=element_text(size=20), axis.text=element_text(size=20), axis.text.y=element_text(colour="black"))
Scale for 'y' is already present. Adding another scale for 'y', which will replace the
existing scale.
## plot together
grid.arrange(gene_plot_5k, numi_plot_5k, ncol = 2, top=textGrob("5K cells 10X", gp=gpar(fontsize=15,font=8)))


## save nGene plot on its own
#ggsave("ngene_plot.pdf", plot = gene1, device = "pdf", height = 5, width = 5, units = "in", path = "/Users/ar19/Desktop/PhD/GCSKO_Analysis")

plot two metrics together

## make a dataframe for important filtering metrics
df <- data.frame(nCount = log10(pb_sex@meta.data$nCount_RNA), nGenes = pb_sex@meta.data$nFeature_RNA, percent_mt = pb_sex@meta.data$percent.mt, experiment = pb_sex@meta.data$experiment)

## plot main dotplot
plot1 <- ggplot(df, aes(x = nCount, y = nGenes)) + 
  geom_point(aes(), size = 0.1) +
  geom_rug() + 
  scale_y_continuous(name = "Number of Detected Genes") + 
  scale_x_continuous(name = "log10(Number of Total UMI)") + 
  theme_pubr() +
  theme(legend.position = "bottom") +
  geom_hline(yintercept=200)

## plot density plot x
dens1 <- ggplot(df, aes(x = nCount)) + 
  geom_density(alpha = 0.2) + 
  theme_void() + 
  theme(legend.position = "none")

## plot density plot y
## old method
# dens2 <- ggplot(df, aes(x = nGenes, y = experiment)) +
#   geom_density(alpha = 0.2) +
#   theme_void() +
#   theme(legend.position = "none") +
#   coord_flip()
## new method
dens <- density(df$nGenes)
dt <- data.frame(x=dens$x, y=dens$y)
probs <- c(0, 0.25, 0.5, 0.75, 1)
quantiles <- quantile(df$nGenes, prob=probs)
dt$quant <- factor(findInterval(dt$x,quantiles))
dens2 <- ggplot(dt, aes(x,y)) + geom_line() + geom_ribbon(aes(ymin=0, ymax=y, fill=quant)) + scale_x_continuous(breaks=quantiles) + scale_fill_brewer(guide="none") + theme_void() + theme(legend.position = "none") +
  coord_flip()

## plot together
QC_composite_plot <- dens1 + plot_spacer() + plot1 + dens2 + plot_layout(ncol = 2, nrow = 2, widths = c(4, 1), heights = c(1, 4))

## print
QC_composite_plot


## save
ggsave("/Users/Andy/GCSKO/GCSKO_analysis_git/images_to_export/QC_10X_composite_plot.png", plot = QC_composite_plot, device = "png", path = NULL, scale = 1, width = 20, height = 20, units = "cm", dpi = 300, limitsize = TRUE)

Filtering

The threshold used in the malaria cell atlas was 230 for Pb but this is dependent on sequencing depth etc. We can plot the number of cells recovered for a range of thresholds:

paste("original number of cells =", nrow(pb_sex@meta.data))
[1] "original number of cells = 7762"
paste("with >150 filter =", nrow(pb_sex@meta.data[pb_sex@meta.data$nFeature_RNA > 150, ]))
[1] "with >150 filter = 7188"
paste("with >200 filter =", nrow(pb_sex@meta.data[pb_sex@meta.data$nFeature_RNA > 200, ]))
[1] "with >200 filter = 6631"
paste("with >230 filter =", nrow(pb_sex@meta.data[pb_sex@meta.data$nFeature_RNA > 230, ]))
[1] "with >230 filter = 6239"

Since we have already filtered on nUMI, we will filter with 200.

## number of cells before filtering
pb_sex_pre_filter_nCells <- nrow(pb_sex@meta.data)
## filter object
pb_sex <- subset(pb_sex, subset = nFeature_RNA > 200)
## number of cells after filtering
pb_sex_post_filter_nCells <- nrow(pb_sex@meta.data)
## print results of filtering
paste("number of cells pre-filter", pb_sex_pre_filter_nCells)
[1] "number of cells pre-filter 7762"
paste("number of cells post-filter", pb_sex_post_filter_nCells)
[1] "number of cells post-filter 6631"
paste((pb_sex_pre_filter_nCells - pb_sex_post_filter_nCells), "cells were removed by filtering on number of genes.")
[1] "1131 cells were removed by filtering on number of genes."

5. Dimensionality Reduction and Clustering

Prepare data

## normalise object
pb_sex <- NormalizeData(pb_sex, normalization.method = "LogNormalize", scale.factor = 10000)
Performing log-normalization
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
## find variable genes
pb_sex <- FindVariableFeatures(pb_sex, selection.method = "vst", nfeatures = 2000)
Calculating gene variances
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
## scale the data
all.genes <- rownames(pb_sex)
pb_sex <- ScaleData(pb_sex, features = all.genes)
Centering and scaling data matrix

  |                                                                                           
  |                                                                                     |   0%
  |                                                                                           
  |==============                                                                       |  17%
  |                                                                                           
  |============================                                                         |  33%
  |                                                                                           
  |==========================================                                           |  50%
  |                                                                                           
  |=========================================================                            |  67%
  |                                                                                           
  |=======================================================================              |  83%
  |                                                                                           
  |=====================================================================================| 100%

PCA

## run PCA
pb_sex <- RunPCA(pb_sex, features = VariableFeatures(object = pb_sex))
PC_ 1 
Positive:  PBANKA-1235600, PBANKA-0931300, PBANKA-1214300, PBANKA-1400400, PBANKA-1101100, PBANKA-0205000, PBANKA-0311800, PBANKA-0823400, PBANKA-1242300, PBANKA-1246600 
       PBANKA-0722600, PBANKA-1008500, PBANKA-1308600, PBANKA-0932400, PBANKA-1314800, PBANKA-1300096, PBANKA-1309900, PBANKA-1200096, PBANKA-1145400, PBANKA-0722801 
       PBANKA-1100441, PBANKA-1040521, PBANKA-1326400, PBANKA-1101300, PBANKA-0941300, PBANKA-1210200, PBANKA-1303800, PBANKA-0216061, PBANKA-0406500, PBANKA-1127000 
Negative:  PBANKA-1312700, PBANKA-1224900, PBANKA-0824400, PBANKA-1453000, PBANKA-1115200, PBANKA-1432200, PBANKA-1204200, PBANKA-0703500, PBANKA-0702000, PBANKA-0812600 
       PBANKA-0312700, PBANKA-1128800, PBANKA-1218300, PBANKA-1106500, PBANKA-1430600, PBANKA-1449300, PBANKA-0209300, PBANKA-1232600, PBANKA-1432400, PBANKA-1421500 
       PBANKA-1320800, PBANKA-0620400, PBANKA-1038800, PBANKA-1110700, PBANKA-1336200, PBANKA-0417600, PBANKA-1451100, PBANKA-1143800, PBANKA-1414500, PBANKA-0402700 
PC_ 2 
Positive:  PBANKA-1431500, PBANKA-1431400, PBANKA-0942400, PBANKA-1340400, PBANKA-0934700, PBANKA-1038700, PBANKA-0942300, PBANKA-1449000, PBANKA-1208800, PBANKA-0828900 
       PBANKA-0911000, PBANKA-0306100, PBANKA-0927700, PBANKA-0806800, PBANKA-0416100, PBANKA-0507800, PBANKA-1409600, PBANKA-0411300, PBANKA-0406200, PBANKA-0415800 
       PBANKA-1033700, PBANKA-1361500, PBANKA-0925400, PBANKA-0615200, PBANKA-1108800, PBANKA-0507300, PBANKA-0720100, PBANKA-0920700, PBANKA-1030400, PBANKA-0510600 
Negative:  PBANKA-1317200, PBANKA-1352500, PBANKA-0810700, PBANKA-1225500, PBANKA-1319500, PBANKA-0402500, PBANKA-0827400, PBANKA-1419300, PBANKA-0714000, PBANKA-1315300 
       PBANKA-0105100, PBANKA-1436300, PBANKA-1311400, PBANKA-0417600, PBANKA-0821400, PBANKA-0417200, PBANKA-0204500, PBANKA-1342300, PBANKA-0704900, PBANKA-1415200 
       PBANKA-0605800, PBANKA-1115000, PBANKA-0517600, PBANKA-1329900, PBANKA-1134900, PBANKA-1231300, PBANKA-1233600, PBANKA-1035200, PBANKA-0912500, PBANKA-1363600 
PC_ 3 
Positive:  PBANKA-1400600, PBANKA-1459300, PBANKA-0416000, PBANKA-0304800, PBANKA-0305100, PBANKA-1365200, PBANKA-1443300, PBANKA-0830200, PBANKA-0938400, PBANKA-0827200 
       PBANKA-1137800, PBANKA-1437300, PBANKA-0932200, PBANKA-0506100, PBANKA-1349000, PBANKA-0313800, PBANKA-1113300, PBANKA-0409800, PBANKA-1017100, PBANKA-1117000 
       PBANKA-0408500, PBANKA-1450300, PBANKA-0922500, PBANKA-0934300, PBANKA-0505200, PBANKA-0941800, PBANKA-0915200, PBANKA-0722921, PBANKA-0316600, PBANKA-0519400 
Negative:  PBANKA-1431500, PBANKA-1431400, PBANKA-1449000, PBANKA-0806800, PBANKA-0927700, PBANKA-0911000, PBANKA-1108000, PBANKA-0828900, PBANKA-0601200, PBANKA-1208800 
       PBANKA-1409600, PBANKA-0830900, PBANKA-0942400, PBANKA-1029400, PBANKA-1038700, PBANKA-0102700, PBANKA-1316500, PBANKA-1304500, PBANKA-0521200, PBANKA-1333100 
       PBANKA-0902400, PBANKA-1129600, PBANKA-1119800, PBANKA-0822900, PBANKA-1109600, PBANKA-0301800, PBANKA-1429100, PBANKA-0622000, PBANKA-1038200, PBANKA-0942300 
PC_ 4 
Positive:  PBANKA-0107300, PBANKA-1214300, PBANKA-0814200, PBANKA-0713300, PBANKA-0604300, PBANKA-0405200, PBANKA-0109100, PBANKA-0932200, PBANKA-0823400, PBANKA-1460400 
       PBANKA-1448000, PBANKA-0416500, PBANKA-0938400, PBANKA-0406500, PBANKA-1302800, PBANKA-1130500, PBANKA-1437300, PBANKA-1444100, PBANKA-0211500, PBANKA-1419100 
       PBANKA-0710100, PBANKA-1223100, PBANKA-1235600, PBANKA-1450300, PBANKA-0919600, PBANKA-0916200, PBANKA-1242300, PBANKA-1326400, PBANKA-0313800, PBANKA-0818900 
Negative:  PBANKA-0519000, PBANKA-0519100, PBANKA-0519300, PBANKA-0831000, PBANKA-1349100, PBANKA-1344400, PBANKA-1002400, PBANKA-0713100, PBANKA-0523700, PBANKA-0111000 
       PBANKA-0519400, PBANKA-1349000, PBANKA-1032100, PBANKA-1014500, PBANKA-0932000, PBANKA-0804500, PBANKA-1315700, PBANKA-0501400, PBANKA-1425900, PBANKA-0519200 
       PBANKA-1101400, PBANKA-0509600, PBANKA-0619700, PBANKA-1035400, PBANKA-0417800, PBANKA-1210600, PBANKA-1327100, PBANKA-0523800, PBANKA-0307500, PBANKA-1117200 
PC_ 5 
Positive:  PBANKA-0501600, PBANKA-1240600, PBANKA-1002600, PBANKA-0707700, PBANKA-0112200, PBANKA-1002400, PBANKA-0208900, PBANKA-0307500, PBANKA-0812200, PBANKA-1423000 
       PBANKA-0832800, PBANKA-1119600, PBANKA-1409200, PBANKA-0716900, PBANKA-0100400, PBANKA-1364400, PBANKA-1347000, PBANKA-0915000, PBANKA-0819600, PBANKA-1319000 
       PBANKA-1212500, PBANKA-1400091, PBANKA-0900900, PBANKA-1202000, PBANKA-1345900, PBANKA-0311500, PBANKA-1008600, PBANKA-0911700, PBANKA-1349100, PBANKA-0519200 
Negative:  PBANKA-1443300, PBANKA-0925600, PBANKA-0306700, PBANKA-1106500, PBANKA-0907200, PBANKA-0304800, PBANKA-1217400, PBANKA-1107600, PBANKA-1349000, PBANKA-1319300 
       PBANKA-1313100, PBANKA-0312500, PBANKA-0611700, PBANKA-1113300, PBANKA-0208000, PBANKA-1241800, PBANKA-0305100, PBANKA-1438000, PBANKA-1339300, PBANKA-0304400 
       PBANKA-0833300, PBANKA-0206300, PBANKA-0207500, PBANKA-0519400, PBANKA-0715300, PBANKA-1360000, PBANKA-0112100, PBANKA-1340500, PBANKA-0915200, PBANKA-1416800 
## plot 
DimPlot(pb_sex, reduction = "pca")


## elbow plot
ElbowPlot(pb_sex, ndims = 30, reduction = "pca")

UMAP

## run UMAP
pb_sex <- RunUMAP(pb_sex, dims = 1:10, seed.use = 1234, n.neighbors = 150)
16:05:48 UMAP embedding parameters a = 0.9922 b = 1.112
16:05:48 Read 6631 rows and found 10 numeric columns
16:05:48 Using Annoy for neighbor search, n_neighbors = 150
16:05:48 Building Annoy index with metric = cosine, n_trees = 50
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
16:05:51 Writing NN index file to temp file /var/folders/wj/rztzclxn1t10cl2sk0plbf3r0000gn/T//RtmpyuC0hm/fileef95491096fc
16:05:51 Searching Annoy index using 1 thread, search_k = 15000
16:06:03 Annoy recall = 100%
16:06:05 Commencing smooth kNN distance calibration using 1 thread
16:06:09 Initializing from normalized Laplacian + noise
16:06:11 Commencing optimization for 500 epochs, with 1163176 positive edges
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
16:06:31 Optimization finished
## plot
DimPlot(pb_sex, reduction = "umap", group.by = "ident", label = TRUE)


## These are the parameters used in the merge UMAP
#pb_sex <- RunUMAP(pb_sex, reduction = "pca", dims = 1:10, n.neighbors = 150, seed.use = 1234, min.dist = 0.4, repulsion.strength = 0.03, local.connectivity = 150)
#DimPlot(pb_sex, reduction = "umap", group.by = "ident", label = TRUE)

colour with a few marker genes:

# PBANKA-0515000 - p25 - female
# PBANKA-1212600 - HAP2 - male
# PBANKA-0600600 - NEK3 - male
# PBANKA-0831000 - MSP1 - late asexual
# PBANKA-1315700 - RON2 - (asexuals and some male?)
# PBANKA-0416100 - dynenin heavy chain - male - used in 820 line
# PBANKA-1319500 - CCP2 - female - used in 820 line 
# PBANKA-1437500 - AP2-G - seuxal commitment gene
# PBANKA-1102200 - MSP8 - early asexual (from Bozdech paper)

FeaturePlot(pb_sex, features = c("PBANKA-0515000", "PBANKA-1212600","PBANKA-0600600", "PBANKA-0831000", "PBANKA-1315700", "PBANKA-0416100", "PBANKA-1319500", "PBANKA-1437500", "PBANKA-1102200"))

Clustering

pb_sex <- FindNeighbors(pb_sex, dims = 1:21)
Computing nearest neighbor graph
Computing SNN
pb_sex <- FindClusters(pb_sex, resolution = 1)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 6631
Number of edges: 277150

Running Louvain algorithm...
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8104
Number of communities: 14
Elapsed time: 1 seconds

6. Remove Doublets

DoubletFinder

DoubletFinder function

## DoubletFinder should be able to be installed and run as so:
#devtools::install_github('chris-mcginnis-ucsf/DoubletFinder')
#library(doubletFinder) #allows removal of doublets
## but there seems to be some problems with this so we will run it from the github code

## the doublet finder function
doubletFinder_v3 <- function(seu, PCs, pN = 0.25, pK, nExp, reuse.pANN = FALSE, sct = FALSE) {
  require(Seurat); require(fields); require(KernSmooth)

  ## Generate new list of doublet classificatons from existing pANN vector to save time
  if (reuse.pANN != FALSE ) {
    pANN.old <- seu@meta.data[ , reuse.pANN]
    classifications <- rep("Singlet", length(pANN.old))
    classifications[order(pANN.old, decreasing=TRUE)[1:nExp]] <- "Doublet"
    seu@meta.data[, paste("DF.classifications",pN,pK,nExp,sep="_")] <- classifications
    return(seu)
  }

  if (reuse.pANN == FALSE) {
    ## Make merged real-artifical data
    real.cells <- rownames(seu@meta.data)
    data <- seu@assays$RNA@counts[, real.cells]
    n_real.cells <- length(real.cells)
    n_doublets <- round(n_real.cells/(1 - pN) - n_real.cells)
    print(paste("Creating",n_doublets,"artificial doublets...",sep=" "))
    real.cells1 <- sample(real.cells, n_doublets, replace = TRUE)
    real.cells2 <- sample(real.cells, n_doublets, replace = TRUE)
    doublets <- (data[, real.cells1] + data[, real.cells2])/2
    colnames(doublets) <- paste("X", 1:n_doublets, sep = "")
    data_wdoublets <- cbind(data, doublets)

    ## Store important pre-processing information
    orig.commands <- seu@commands

    ## Pre-process Seurat object
    if (sct == FALSE) {
      print("Creating Seurat object...")
      seu_wdoublets <- CreateSeuratObject(counts = data_wdoublets)

      print("Normalizing Seurat object...")
      seu_wdoublets <- NormalizeData(seu_wdoublets,
                                     normalization.method = orig.commands$NormalizeData.RNA@params$normalization.method,
                                     scale.factor = orig.commands$NormalizeData.RNA@params$scale.factor,
                                     margin = orig.commands$NormalizeData.RNA@params$margin)

      print("Finding variable genes...")
      seu_wdoublets <- FindVariableFeatures(seu_wdoublets,
                                            selection.method = orig.commands$FindVariableFeatures.RNA$selection.method,
                                            loess.span = orig.commands$FindVariableFeatures.RNA$loess.span,
                                            clip.max = orig.commands$FindVariableFeatures.RNA$clip.max,
                                            mean.function = orig.commands$FindVariableFeatures.RNA$mean.function,
                                            dispersion.function = orig.commands$FindVariableFeatures.RNA$dispersion.function,
                                            num.bin = orig.commands$FindVariableFeatures.RNA$num.bin,
                                            binning.method = orig.commands$FindVariableFeatures.RNA$binning.method,
                                            nfeatures = orig.commands$FindVariableFeatures.RNA$nfeatures,
                                            mean.cutoff = orig.commands$FindVariableFeatures.RNA$mean.cutoff,
                                            dispersion.cutoff = orig.commands$FindVariableFeatures.RNA$dispersion.cutoff)

      print("Scaling data...")
      seu_wdoublets <- ScaleData(seu_wdoublets,
                                 features = orig.commands$ScaleData.RNA$features,
                                 model.use = orig.commands$ScaleData.RNA$model.use,
                                 do.scale = orig.commands$ScaleData.RNA$do.scale,
                                 do.center = orig.commands$ScaleData.RNA$do.center,
                                 scale.max = orig.commands$ScaleData.RNA$scale.max,
                                 block.size = orig.commands$ScaleData.RNA$block.size,
                                 min.cells.to.block = orig.commands$ScaleData.RNA$min.cells.to.block)

      print("Running PCA...")
      seu_wdoublets <- RunPCA(seu_wdoublets,
                              features = orig.commands$ScaleData.RNA$features,
                              npcs = length(PCs),
                              rev.pca =  orig.commands$RunPCA.RNA$rev.pca,
                              weight.by.var = orig.commands$RunPCA.RNA$weight.by.var,
                              verbose=FALSE)
      pca.coord <- seu_wdoublets@reductions$pca@cell.embeddings[ , PCs]
      cell.names <- rownames(seu_wdoublets@meta.data)
      nCells <- length(cell.names)
      rm(seu_wdoublets); gc() # Free up memory
    }

    if (sct == TRUE) {
      require(sctransform)
      print("Creating Seurat object...")
      seu_wdoublets <- CreateSeuratObject(counts = data_wdoublets)

      print("Running SCTransform...")
      seu_wdoublets <- SCTransform(seu_wdoublets)

      print("Running PCA...")
      seu_wdoublets <- RunPCA(seu_wdoublets, npcs = length(PCs))
      pca.coord <- seu_wdoublets@reductions$pca@cell.embeddings[ , PCs]
      cell.names <- rownames(seu_wdoublets@meta.data)
      nCells <- length(cell.names)
      rm(seu_wdoublets); gc()
    }

    ## Compute PC distance matrix
    print("Calculating PC distance matrix...")
    dist.mat <- fields::rdist(pca.coord)

    ## Compute pANN
    print("Computing pANN...")
    pANN <- as.data.frame(matrix(0L, nrow = n_real.cells, ncol = 1))
    rownames(pANN) <- real.cells
    colnames(pANN) <- "pANN"
    k <- round(nCells * pK)
    for (i in 1:n_real.cells) {
      neighbors <- order(dist.mat[, i])
      neighbors <- neighbors[2:(k + 1)]
      neighbor.names <- rownames(dist.mat)[neighbors]
      pANN$pANN[i] <- length(which(neighbors > n_real.cells))/k
    }

    print("Classifying doublets..")
    classifications <- rep("Singlet",n_real.cells)
    classifications[order(pANN$pANN[1:n_real.cells], decreasing=TRUE)[1:nExp]] <- "Doublet"
    seu@meta.data[, paste("pANN",pN,pK,nExp,sep="_")] <- pANN[rownames(seu@meta.data), 1]
    seu@meta.data[, paste("DF.classifications",pN,pK,nExp,sep="_")] <- classifications
    return(seu)
  }
}
## usage: https://rdrr.io/github/chris-mcginnis-ucsf/DoubletFinder/man/doubletFinder_v3.html
## source: https://github.com/chris-mcginnis-ucsf/DoubletFinder/blob/master/R/doubletFinder_v3.R

Run DoubletFinder

# the tutorial recommends using this as an approximation:
#nExp_poi <- round(0.15*nrow(pb_sex@meta.data))
#but a more appropriate approximation is that the expected number of doublets is ~1% per 1000 cells so:
nExp_poi <- round((0.01*(nrow(pb_sex@meta.data)/1000))*nrow(pb_sex@meta.data))
#run doublet finder:
pb_sex <- doubletFinder_v3(pb_sex, PCs = 1:21, pN = 0.25, pK = 0.01, nExp = nExp_poi, reuse.pANN = FALSE, sct = FALSE)
[1] "Creating 2210 artificial doublets..."
[1] "Creating Seurat object..."
[1] "Normalizing Seurat object..."
Performing log-normalization
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Finding variable genes..."
Calculating gene variances
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Scaling data..."
Centering and scaling data matrix

  |                                                                                           
  |                                                                                     |   0%
  |                                                                                           
  |==============                                                                       |  17%
  |                                                                                           
  |============================                                                         |  33%
  |                                                                                           
  |==========================================                                           |  50%
  |                                                                                           
  |=========================================================                            |  67%
  |                                                                                           
  |=======================================================================              |  83%
  |                                                                                           
  |=====================================================================================| 100%
[1] "Running PCA..."
[1] "Calculating PC distance matrix..."
[1] "Computing pANN..."
[1] "Classifying doublets.."

results in:

table(pb_sex@meta.data$DF.classifications_0.25_0.01_440)

Doublet Singlet 
    440    6191 

Validation and visualisation of doublets

visualise where doublets are:

doublet.cells <- c(rownames(pb_sex@meta.data[pb_sex@meta.data$DF.classifications_0.25_0.01_440 == "Doublet",]))
d1 <- DimPlot(pb_sex, reduction = "umap", cells.highlight = doublet.cells, sizes.highlight = 2)
#TSNEPlot(object = pb_sex, cells.highlight = doublet.cells, do.return = TRUE, )
doublet1 <- d1 + coord_fixed() + theme(axis.text.x=element_blank())

## plot clusters
cluster_plot <- DimPlot(pb_sex, reduction = "umap", group.by = "ident", label = TRUE)

doublet1 + cluster_plot

VlnPlot(object = pb_sex, features = "pANN_0.25_0.01_440", pt.size = 0.1)

## extract meta data cols of interest
df <- pb_sex@meta.data[,c("RNA_snn_res.1","DF.classifications_0.25_0.01_440")]

## make a table from doublets
df <- data.frame(rbind(table(df)))

## ammend rownames
df$cluster <- rownames(df)

## calculate percentages of cells that are doublets
df$pc_doublet <- ((df[,1])/((df[,1]) + df[,2]))*100

## inspect
#kable(df[order(df$pc_doublet),])

## plot 
ggplot(data=df, aes(x=cluster, y=pc_doublet)) +
  geom_col(fill="steelblue") +
  theme_minimal()

Filter doublets

It definitely seems like there are some biases in doublet detection. Fewer doublets in very early rings and in mature sexes may be due to a smaller number of the population being these cells.

It may also be biological, that these cells are less likely to associate to one another (although less likely, as doublets are a result of the probability of two cells being captured inside the same droplet together at a certain loading concentration, rather than two cells already being together upon droplet capture).

remove doublets:

## make a list of singlet cells
keep_singlets <- rownames(pb_sex@meta.data[pb_sex@meta.data$DF.classifications_0.25_0.01_440 == "Singlet",])

## subset into new seurat object
pb_sex_filtered <- subset(pb_sex, cells = keep_singlets, subset.raw = TRUE)
The following arguments are not used: subset.raw
## compare old and new objects
pb_sex
An object of class Seurat 
5098 features across 6631 samples within 1 assay 
Active assay: RNA (5098 features, 2000 variable features)
 2 dimensional reductions calculated: pca, umap
pb_sex_filtered
An object of class Seurat 
5098 features across 6191 samples within 1 assay 
Active assay: RNA (5098 features, 2000 variable features)
 2 dimensional reductions calculated: pca, umap

7. Life Cycle Stage (Using Bulk RNA-Seq Correlation)

Add in bulk data predictions

hoo et al.

#Pb Prediction correlations with bulk data (asexual hoo): 

#Load in required package:
library(Hmisc)
#Cooerce expression data into a matrix and load in the reference timecourse data:
x10 <- as.matrix(pb_sex_filtered@assays$RNA@data)
rownames(x10) <- gsub("-", "_", rownames(x10))
#read in bulk data:
hoo<-as.matrix(read.table("/Users/Andy/GCSKO/GCSKO_analysis_git/data/Reference/hoo_berg2.txt",header=T, row.names=1))
#Make a blank dataframe in which to add prediction:
df <- data.frame(matrix(ncol = 4, nrow = 0))
colnames(df) <- c("Prediction(Spearman)","r(Spearman)","Prediction(Pearsons)","r(Pearsons)")
#Do correlations with bulk data using both Spearman and Pearson (and the top 1000 genes):
for (i in 1:ncol(x10))
{
  shared<-intersect(row.names(as.matrix(head(sort(x10[,i], decreasing=TRUE),1000))),row.names(hoo))
  step0<-rcorr(x10[shared,i],hoo[shared,1:12],type = "spearman")
  step1<-as.matrix(t(step0$r[2:13,1]))
  step2<-rcorr(x10[shared,i],hoo[shared,1:12],type = "pearson")
  step3<-as.matrix(t(step2$r[2:13,1]))
  step4<-cbind(colnames(step1)[which.max(step1)],step1[which.max(step1)],colnames(step3)[which.max(step3)],step3[which.max(step3)])
  colnames(step4) <- c("Prediction(Spearman)","r(Spearman)","Prediction(Pearsons)","r(Pearsons)")
  rownames(step4)<-colnames(x10)[i]
  df<-rbind(df,step4)
}
#Write out data into a csv file:
#write.csv(dfringr,file="/Users/ar19/Desktop/PhD/AR04_GCSKO_project/All_mutants_Feb_2018/predictionpbcombined.csv")
#Change the format of the output to make it more readable:
#gsub("Pb_","", dfringr[,1]) - Make predictions into 18hr.dat format:

#spearman:
df[,1] <- gsub("Pb_","", df[,1])
#Remove hr.dat from list:
df[,1] <- gsub("hr.dat","", df[,1])
#Check - dfringr[,1]
#Make into a number:
df[,1] <- as.numeric(df[,1])
df[,2] <- as.numeric(as.character(df[,2]))

#pearson:
df[,3] <- gsub("Pb_","", df[,3])
#Remove hr.dat from list:
df[,3] <- gsub("hr.dat","", df[,3])
#Check - dfringr[,1]
#Make into a number:
df[,3] <- as.numeric(df[,3])
df[,4] <- as.numeric(as.character(df[,4]))
#add to 10X object:
pb_sex_filtered <- AddMetaData(pb_sex_filtered, metadata = df)
Invalid name supplied, making object name syntactically valid. New object name is Prediction.Spearman.r.Spearman.Prediction.Pearsons.r.Pearsons.; see ?make.names for more details on syntax validity

Kasia’s data

Can also do with Kasia’s timecourse data:

kas<-as.matrix(read.table("/Users/Andy/GCSKO/GCSKO_analysis_git/data/Reference/AP2OETC.txt",header=T, row.names=1))
#Make a blank dataframe in which to add prediction:
dfs <- data.frame(matrix(ncol = 4, nrow = 0))
colnames(dfs) <- c("ID","Prediction","r (Pearson)")
#Do correlations with bulk data using both Spearman and Pearson (and the top 1000 genes):
for (i in 1:ncol(x10))
{
  shared<-intersect(row.names(as.matrix(head(sort(x10[,i], decreasing=TRUE),1000))),rownames(kas))
  step0<-rcorr(x10[shared,i],kas[shared,1:10],type = "spearman")
  step1<-as.matrix(t(step0$r[2:11,1]))
  step2<-rcorr(x10[shared,i],kas[shared,1:10],type = "pearson")
  step3<-as.matrix(t(step2$r[2:11,1]))
  step4<-cbind(colnames(step1)[which.max(step1)],step1[which.max(step1)],colnames(step3)[which.max(step3)],step3[which.max(step3)])
  colnames(step4) <- c("Prediction(Spearman)","r(Spearman)","Prediction(Pearsons)","r(Pearsons)")
  rownames(step4)<-colnames(x10)[i]
  dfs<-rbind(dfs,step4)
}
#Write out data into a csv file:
#write.csv(df,file="/Users/ar19/Desktop/PhD/AR04_GCSKO_project/All_mutants_Feb_2018/predictionkasiacombined.csv")

#Change the format of the output to make it more readable:
#gsub("Pb_","", dfs[,1]) - Make predictions into 18hr.dat format:
dfs[,1] <- gsub("X","", dfs[,1])
#Make into a number:
dfs[,1] <- as.numeric(dfs[,1])
#Make into a number:
dfs[,2] <- as.numeric(as.character(dfs[,2]))

#gsub("Pb_","", dfs[,1]) - Make predictions into 18hr.dat format:
dfs[,3] <- gsub("X","", dfs[,3])
#Make into a number:
dfs[,3] <- as.numeric(dfs[,3])
#dfs[,1]
#Make into a number:
dfs[,4] <- as.numeric(as.character(dfs[,4]))

colnames(dfs) <- c('Prediction(Spearman)_Kasia', 'r(Spearman)_Kasia', 'Prediction(Pearson)_Kasia', 'r(Pearson)_Kasia')
#add to Seurat:
#add to 10X object:
pb_sex_filtered <- AddMetaData(pb_sex_filtered, dfs)
Invalid name supplied, making object name syntactically valid. New object name is Prediction.Spearman._Kasiar.Spearman._KasiaPrediction.Pearson._Kasiar.Pearson._Kasia; see ?make.names for more details on syntax validity

Visualise

Confirm life cycle designations:

## plot
FeaturePlot(pb_sex_filtered, features = c("Prediction.Spearman._Kasia", "Prediction.Spearman."))

optimse UMAP

## PCA calc
pb_sex_filtered <- RunPCA(pb_sex_filtered, features = VariableFeatures(object = pb_sex_filtered))
The following 5 features requested have zero variance (running reduction without them): PBANKA-0612861, PBANKA-0836921, PBANKA-1000041, PBANKA-API0290, PBANKA-API0031PC_ 1 
Positive:  PBANKA-1235600, PBANKA-0931300, PBANKA-1214300, PBANKA-1400400, PBANKA-1101100, PBANKA-0205000, PBANKA-0311800, PBANKA-0823400, PBANKA-1242300, PBANKA-1246600 
       PBANKA-0722600, PBANKA-1008500, PBANKA-0932400, PBANKA-1308600, PBANKA-1314800, PBANKA-1309900, PBANKA-1300096, PBANKA-1145400, PBANKA-1200096, PBANKA-1100441 
       PBANKA-0722801, PBANKA-1101300, PBANKA-1326400, PBANKA-1040521, PBANKA-0941300, PBANKA-1127000, PBANKA-1303800, PBANKA-0406500, PBANKA-1210200, PBANKA-0814200 
Negative:  PBANKA-1312700, PBANKA-1224900, PBANKA-0824400, PBANKA-1115200, PBANKA-1453000, PBANKA-1432200, PBANKA-1204200, PBANKA-0703500, PBANKA-0702000, PBANKA-0812600 
       PBANKA-0312700, PBANKA-1128800, PBANKA-1218300, PBANKA-1232600, PBANKA-1432400, PBANKA-1430600, PBANKA-1449300, PBANKA-1106500, PBANKA-0209300, PBANKA-0620400 
       PBANKA-1421500, PBANKA-1336200, PBANKA-1320800, PBANKA-0417600, PBANKA-1038800, PBANKA-1129800, PBANKA-1414500, PBANKA-1451100, PBANKA-1143800, PBANKA-0402700 
PC_ 2 
Positive:  PBANKA-1431500, PBANKA-1431400, PBANKA-0942400, PBANKA-1340400, PBANKA-1449000, PBANKA-1038700, PBANKA-0942300, PBANKA-0934700, PBANKA-1208800, PBANKA-0828900 
       PBANKA-0911000, PBANKA-0306100, PBANKA-0927700, PBANKA-0806800, PBANKA-1409600, PBANKA-0406200, PBANKA-0411300, PBANKA-0415800, PBANKA-0507800, PBANKA-0416100 
       PBANKA-1033700, PBANKA-1361500, PBANKA-1108800, PBANKA-0920700, PBANKA-0510600, PBANKA-0925400, PBANKA-0720100, PBANKA-0507300, PBANKA-1119800, PBANKA-1030400 
Negative:  PBANKA-1317200, PBANKA-1352500, PBANKA-0827400, PBANKA-0810700, PBANKA-1225500, PBANKA-1319500, PBANKA-0402500, PBANKA-1419300, PBANKA-0714000, PBANKA-1315300 
       PBANKA-1436300, PBANKA-0105100, PBANKA-1311400, PBANKA-0821400, PBANKA-0417600, PBANKA-0204500, PBANKA-1342300, PBANKA-0417200, PBANKA-0605800, PBANKA-0704900 
       PBANKA-0517600, PBANKA-1231300, PBANKA-1329900, PBANKA-1415200, PBANKA-1115000, PBANKA-1134900, PBANKA-1035200, PBANKA-1233600, PBANKA-0907100, PBANKA-0912500 
PC_ 3 
Positive:  PBANKA-1400600, PBANKA-1459300, PBANKA-0416000, PBANKA-0305100, PBANKA-0304800, PBANKA-1443300, PBANKA-1365200, PBANKA-0830200, PBANKA-0938400, PBANKA-1349000 
       PBANKA-0827200, PBANKA-1137800, PBANKA-1437300, PBANKA-1113300, PBANKA-0932200, PBANKA-0506100, PBANKA-1017100, PBANKA-0313800, PBANKA-0408500, PBANKA-1117000 
       PBANKA-0409800, PBANKA-0922500, PBANKA-0519400, PBANKA-0925600, PBANKA-0316600, PBANKA-0915200, PBANKA-0722921, PBANKA-0934300, PBANKA-0941800, PBANKA-1225000 
Negative:  PBANKA-1431500, PBANKA-1431400, PBANKA-1449000, PBANKA-0806800, PBANKA-1108000, PBANKA-0927700, PBANKA-0911000, PBANKA-0828900, PBANKA-0601200, PBANKA-1208800 
       PBANKA-1409600, PBANKA-0830900, PBANKA-0521200, PBANKA-1029400, PBANKA-0102700, PBANKA-0942400, PBANKA-1304500, PBANKA-1316500, PBANKA-1038700, PBANKA-1333100 
       PBANKA-0902400, PBANKA-1109600, PBANKA-1129600, PBANKA-1119800, PBANKA-0822900, PBANKA-0719700, PBANKA-1429100, PBANKA-1038200, PBANKA-0301800, PBANKA-1320100 
PC_ 4 
Positive:  PBANKA-0519000, PBANKA-1349100, PBANKA-0519100, PBANKA-0519300, PBANKA-1344400, PBANKA-0831000, PBANKA-1002400, PBANKA-0713100, PBANKA-0523700, PBANKA-0111000 
       PBANKA-1032100, PBANKA-1014500, PBANKA-0519400, PBANKA-0932000, PBANKA-0804500, PBANKA-1349000, PBANKA-1315700, PBANKA-0501400, PBANKA-1101400, PBANKA-1425900 
       PBANKA-0519200, PBANKA-1210600, PBANKA-0509600, PBANKA-0619700, PBANKA-1327100, PBANKA-0523800, PBANKA-0307500, PBANKA-1035400, PBANKA-1119600, PBANKA-1117200 
Negative:  PBANKA-0107300, PBANKA-0814200, PBANKA-1214300, PBANKA-0713300, PBANKA-0604300, PBANKA-0405200, PBANKA-0932200, PBANKA-0109100, PBANKA-0823400, PBANKA-0938400 
       PBANKA-0416500, PBANKA-1460400, PBANKA-1437300, PBANKA-1302800, PBANKA-1130500, PBANKA-0406500, PBANKA-1448000, PBANKA-0211500, PBANKA-1444100, PBANKA-1450300 
       PBANKA-0710100, PBANKA-1223100, PBANKA-1419100, PBANKA-0313800, PBANKA-0818900, PBANKA-0919600, PBANKA-1235600, PBANKA-1242300, PBANKA-0916200, PBANKA-1203700 
PC_ 5 
Positive:  PBANKA-0501600, PBANKA-1240600, PBANKA-0707700, PBANKA-0112200, PBANKA-1002600, PBANKA-0208900, PBANKA-0812200, PBANKA-1423000, PBANKA-0832800, PBANKA-1409200 
       PBANKA-0307500, PBANKA-0716900, PBANKA-0100400, PBANKA-1002400, PBANKA-0915000, PBANKA-1364400, PBANKA-1119600, PBANKA-1347000, PBANKA-0819600, PBANKA-1319000 
       PBANKA-1212500, PBANKA-0900900, PBANKA-1202000, PBANKA-1400091, PBANKA-1228800, PBANKA-0911700, PBANKA-0311500, PBANKA-1345900, PBANKA-1025300, PBANKA-0519200 
Negative:  PBANKA-1443300, PBANKA-0925600, PBANKA-0306700, PBANKA-0907200, PBANKA-1349000, PBANKA-1106500, PBANKA-1217400, PBANKA-1107600, PBANKA-1319300, PBANKA-0304800 
       PBANKA-0519400, PBANKA-0312500, PBANKA-1313100, PBANKA-0305100, PBANKA-0112100, PBANKA-0611700, PBANKA-1241800, PBANKA-0304400, PBANKA-0833300, PBANKA-0208000 
       PBANKA-1113300, PBANKA-1339300, PBANKA-1438000, PBANKA-0206300, PBANKA-1340500, PBANKA-0207500, PBANKA-0915200, PBANKA-0808000, PBANKA-1020100, PBANKA-1360000 
## elbow plot
ElbowPlot(pb_sex_filtered, ndims = 30, reduction = "pca")


## UMAP calc
pb_sex_filtered <- RunUMAP(pb_sex_filtered, dims = 1:8, seed.use = 300, n.neighbors = 60, min.dist = 0.5, repulsion.strength = 0.05, local.connectivity = 20)
16:11:27 UMAP embedding parameters a = 0.583 b = 1.334
16:11:27 Read 6191 rows and found 8 numeric columns
16:11:27 Using Annoy for neighbor search, n_neighbors = 60
16:11:27 Building Annoy index with metric = cosine, n_trees = 50
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
16:11:29 Writing NN index file to temp file /var/folders/wj/rztzclxn1t10cl2sk0plbf3r0000gn/T//RtmpyuC0hm/fileef9560289fdd
16:11:30 Searching Annoy index using 1 thread, search_k = 6000
16:11:36 Annoy recall = 100%
16:11:39 Commencing smooth kNN distance calibration using 1 thread
16:11:39 6191 smooth knn distance failures
16:11:43 Initializing from normalized Laplacian + noise
16:11:45 Commencing optimization for 500 epochs, with 184004 positive edges
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
16:12:22 Optimization finished
## UMAP plot
DimPlot(pb_sex_filtered, reduction = "umap", group.by = "ident", label = TRUE)

8. Save and Export

Save environment

## This saves everything in the global environment for easy recall later
#save.image(file = "GCSKO_10X_QC.RData")
#load(file = "GCSKO_10X_QC.RData")

Save object(s)

## save Rdata
#save(pb_30k_sex_filtered, pb_sex_filtered, file = "Part_2_input.Rdata")
#load(file = "Part_2_input.Rdata")

## save RDS
saveRDS(pb_sex_filtered, file = "/Users/Andy/GCSKO/GCSKO_analysis_git/data_to_export/pb_sex_filtered.RDS", compress = FALSE) 
#pb_sex_filtered <- readRDS("pb_sex_filtered.RDS")

## Save Robj
#save(pb_sex_filtered,file="pb_sex_filtered.Robj")

Appendix

Session Info

R version 4.0.2 (2020-06-22)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Catalina 10.15.6

Matrix products: default
BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib

locale:
[1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8

attached base packages:
[1] grid      stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] viridis_0.5.1      viridisLite_0.3.0  KernSmooth_2.23-17 fields_10.3       
 [5] maps_3.3.0         spam_2.5-1         dotCall64_1.0-0    ggridges_0.5.2    
 [9] patchwork_1.0.1    ggpubr_0.4.0       scales_1.1.1       mixtools_1.2.0    
[13] dplyr_1.0.0        Hmisc_4.4-0        ggplot2_3.3.2      Formula_1.2-3     
[17] survival_3.2-3     lattice_0.20-41    gridExtra_2.3      cowplot_1.0.0     
[21] Seurat_3.2.0       knitr_1.29        

loaded via a namespace (and not attached):
  [1] readxl_1.3.1          backports_1.1.8       plyr_1.8.6            igraph_1.2.5         
  [5] lazyeval_0.2.2        splines_4.0.2         listenv_0.8.0         usethis_1.6.1        
  [9] digest_0.6.25         htmltools_0.5.0       fansi_0.4.1           magrittr_1.5         
 [13] checkmate_2.0.0       memoise_1.1.0         tensor_1.5            cluster_2.1.0        
 [17] ROCR_1.0-11           openxlsx_4.1.5        remotes_2.1.1         globals_0.12.5       
 [21] prettyunits_1.1.1     jpeg_0.1-8.1          colorspace_1.4-1      ggrepel_0.8.2        
 [25] haven_2.3.1           xfun_0.15             callr_3.4.3           crayon_1.3.4         
 [29] jsonlite_1.7.0        spatstat_1.64-1       spatstat.data_1.4-3   zoo_1.8-8            
 [33] ape_5.4               glue_1.4.1            polyclip_1.10-0       gtable_0.3.0         
 [37] leiden_0.3.3          car_3.0-8             pkgbuild_1.1.0        kernlab_0.9-29       
 [41] future.apply_1.6.0    abind_1.4-5           rstatix_0.6.0         miniUI_0.1.1.1       
 [45] Rcpp_1.0.5            xtable_1.8-4          htmlTable_2.0.1       reticulate_1.16      
 [49] foreign_0.8-80        rsvd_1.0.3            htmlwidgets_1.5.1     httr_1.4.2           
 [53] RColorBrewer_1.1-2    acepack_1.4.1         ellipsis_0.3.1        ica_1.0-2            
 [57] farver_2.0.3          pkgconfig_2.0.3       nnet_7.3-14           uwot_0.1.8           
 [61] deldir_0.1-28         labeling_0.3          tidyselect_1.1.0      rlang_0.4.7          
 [65] reshape2_1.4.4        later_1.1.0.1         cellranger_1.1.0      munsell_0.5.0        
 [69] tools_4.0.2           cli_2.0.2             generics_0.0.2        broom_0.7.0          
 [73] devtools_2.3.0        evaluate_0.14         stringr_1.4.0         fastmap_1.0.1        
 [77] yaml_2.2.1            goftest_1.2-2         processx_3.4.3        fs_1.4.2             
 [81] fitdistrplus_1.1-1    zip_2.1.0             purrr_0.3.4           RANN_2.6.1           
 [85] pbapply_1.4-2         future_1.18.0         nlme_3.1-148          mime_0.9             
 [89] compiler_4.0.2        rstudioapi_0.11       curl_4.3              plotly_4.9.2.1       
 [93] png_0.1-7             ggsignif_0.6.0        testthat_2.3.2        spatstat.utils_1.17-0
 [97] tibble_3.0.3          stringi_1.4.6         highr_0.8             ps_1.3.3             
[101] RSpectra_0.16-0       desc_1.2.0            forcats_0.5.0         Matrix_1.2-18        
[105] vctrs_0.3.2           pillar_1.4.6          lifecycle_0.2.0       lmtest_0.9-37        
[109] RcppAnnoy_0.0.16      data.table_1.12.8     irlba_2.3.3           httpuv_1.5.4         
[113] R6_2.4.1              latticeExtra_0.6-29   promises_1.1.1        rio_0.5.16           
[117] sessioninfo_1.1.1     codetools_0.2-16      MASS_7.3-51.6         assertthat_0.2.1     
[121] pkgload_1.1.0         rprojroot_1.3-2       withr_2.2.0           sctransform_0.2.1    
[125] hms_0.5.3             mgcv_1.8-31           parallel_4.0.2        rpart_4.1-15         
[129] tidyr_1.1.0           rmarkdown_2.3         carData_3.0-4         segmented_1.2-0      
[133] Rtsne_0.15            shiny_1.5.0           base64enc_0.1-3       tinytex_0.25         
---
subtitle: 'Gametocyte Development in <i>Plasmodium berghei</i>'
title: |
  ![](/Users/Andy/GCSKO/GCSKO_analysis_git/GCSKO_logo.jpg){width=300px}  
  10X Quality Control
author: "[Andrew Russell](https://ajcrussell.wixsite.com/mysite/about)"
institute: Wellcome Sanger Institute
date: '`r format(Sys.Date(), "%B %d, %Y")`'
output:
  html_notebook:
    theme: cosmo
    toc: yes
    toc_depth: 3
    #toc_float: yes
    df_print: paged
---
***
# 1. Introduction and Aims {.tabset}

Two datasets were generated using the 10X Genomics Chromium 3' scRNA-Seq platform:

```{r intro, echo=FALSE, results='asis'}
## load knitr to display table
library(knitr)
## make dataframe
species <- c("pb", "pb")
experiment_name <- c('straight bleed experiment','1:1 mix experiment')
run_number <- c("22252", "24284")
lane_number <- c("5", "1 & 2")
sequencer <- c("Hiseq 4000", "Hiseq 2500")
approximate_number_of_cells <- c("30,000", "5,000")
employ.data <- data.frame(species, experiment_name, run_number, lane_number, sequencer, approximate_number_of_cells, stringsAsFactors=FALSE)
## print dataframe
kable(employ.data)
```

This data has been processed using CellRanger into counts tables. This initial analysis gave the following metrics:

<b>Pb 1:1 mix experiment (run #: 24284 lanes 1 and 2 (Hiseq 2500)):</b>

<div style="width:500px; height:400px">![](/Users/Andy/GCSKO/GCSKO_analysis_git/data/10X/24284_cellranger_output_1.png)</div>
<div style="width:500px; height:200px">![](/Users/Andy/GCSKO/GCSKO_analysis_git/data/10X/24284_cellranger_output_2.png)</div>

<b>We will load this data in and for each run:</b>

A. Define 'cells'

B. Filter poor quality cells out

C. Dimensionality Reduction and Clustering 

D. Remove Doublets

E. Predict life Cycle Stage (Using Bulk RNA-Seq Correlation)

# 2. Read in the data  {.tabset}

### Load the required packages

```{r load packages, echo = FALSE}
## Seurat is needed for most of this script
if(require("Seurat", quietly = TRUE)){
    print("Seurat is loaded correctly")
} else {
    print("trying to install Seurat")
    install.packages("Seurat")
    if(require(Seurat)){
        print("Seurat installed and loaded")
    } else {
        stop("could not install Seurat")
    }
}

## cowplot is needed for plots
if(require("cowplot")){
    print("cowplot is loaded correctly")
} else {
    print("trying to install cowplot")
    install.packages("cowplot")
    if(require(cowplot)){
        print("cowplot installed and loaded")
    } else {
        stop("could not install cowplot")
    }
}

## gridExtra is needed for grid graphics to plot multiple plots in the same view
if(require("gridExtra")){
    print("gridExtra is loaded correctly")
} else {
    print("trying to install gridExtra")
    install.packages("gridExtra")
    if(require(gridExtra)){
        print("gridExtra installed and loaded")
    } else {
        stop("could not install gridExtra")
    }
}

##for grid.arrange function to change size of title
if(require("grid")){
    print("grid is loaded correctly")
} else {
    print("trying to install grid")
    install.packages("grid")
    if(require(grid)){
        print("grid installed and loaded")
    } else {
        stop("could not install grid")
    }
}

## for doing bulk correlation calculations
if(require("Hmisc")){
    print("Hmisc is loaded correctly")
} else {
    print("trying to install Hmisc")
    install.packages("Hmisc")
    if(require(Hmisc)){
        print("Hmisc installed and loaded")
    } else {
        stop("could not install Hmisc")
    }
}

## dplyr is needed to work with data frames
if(require("dplyr")){
    print("dplyr is loaded correctly")
} else {
    print("trying to install dplyr")
    install.packages("dplyr")
    if(require(dplyr)){
        print("dplyr installed and loaded")
    } else {
        stop("could not install dplyr")
    }
}

## scales is needed for break formatting functions in the barcode plot
if(require("scales")){
    print("scales is loaded correctly")
} else {
    print("trying to install scales")
    install.packages("scales")
    if(require(scales)){
        print("scales installed and loaded")
    } else {
        stop("could not install scales")
    }
}

## ggpubr is needed for plotting
if(require("ggpubr", quietly = TRUE)){
    print("ggpubr is loaded correctly")
} else {
    print("trying to install ggpubr")
    install.packages("ggpubr")
    if(require(ggpubr)){
        print("ggpubr installed and loaded")
    } else {
        stop("could not install ggpubr")
    }
}

## patchwork is needed for plotting
## WARNING! cowplot over-rides this by masking it so be careful.
if(require("patchwork", quietly = TRUE)){
    print("patchwork is loaded correctly")
} else {
    print("trying to install patchwork")
    install.packages("patchwork")
    if(require(patchwork)){
        print("patchwork installed and loaded")
    } else {
        stop("could not install patchwork")
    }
}

## set the seed for both the mixture models and also for the sample function later on:
set.seed(-92497)
```

### Import GTF file

This will be helpful later on. This contains annotations for each gene:
```{r import gtf}
##Import gtf file:
gtf <- read.table("/Users/Andy/GCSKO/GCSKO_analysis_git/data/Reference/Pberghei.gtf", sep="\t", header = FALSE)
head(gtf)
```

### Read in the Data

```{r import data}
## read in 10x output 
tenx5k_raw_data <- Read10X("/Users/Andy/GCSKO/GCSKO_analysis_git/data/10X/tenx_24284")

## Create Seurat object
tenx5k <- CreateSeuratObject(counts = tenx5k_raw_data, min.cells = 0, min.features = 0, project = "GCSKO")

## add experiment to meta data for merging later
tenx5k@meta.data$experiment <- "tenx5k"
```

# 3. Defining Cells vs. Background {.tabset}

Plot a knee plot and then use a mixture model to define where the cells vs. background lie
```{r mixmodel setup}
## interesting reference material for this section can be found here: https://hemberg-lab.github.io/scRNA.seq.course/processing-raw-scrna-seq-data.html 

## get the nUMIs
umi_per_barcode <- as.data.frame(tenx5k@meta.data$nCount_RNA)

## remove zeros as these have issues when you log them and make the model later:
umi_per_barcode <- as.data.frame(umi_per_barcode[!(umi_per_barcode$`tenx5k@meta.data$nCount_RNA`==0), ])

## get a rank for each barcode
barcode_rank <- rank(-umi_per_barcode[,1])

## then make into a list
lib_size <- (umi_per_barcode[,1])

## then log this
log_lib_size <- log10(umi_per_barcode[,1])

##plot
#plot(barcode_rank, log_lib_size, xlim=c(1,100000))

## order the barcode ranks
o <- order(barcode_rank)

## reorder the library size, barcode rank by their rank
log_lib_size <- log_lib_size[o]
barcode_rank <- barcode_rank[o]
lib_size <- lib_size[o]
```

make a mixture model to determine the knee of the plot
```{r mixmodel calc}
## set a seed for the mixture model
set.seed(-92497)

## mixture model calculation 
require("mixtools")
mix <- normalmixEM(log_lib_size)

## plot result
plot(mix, which=2, xlab2="log(mol per cell)")
```

Find where the distributions intersect (i.e. where cells vs. background is)
```{r calculate split}
## identify where the split between the distributions is
p1 <- dnorm(log_lib_size, mean=mix$mu[1], sd=mix$sigma[1])
p2 <- dnorm(log_lib_size, mean=mix$mu[2], sd=mix$sigma[2])
if (mix$mu[1] < mix$mu[2]) {
    split <- min(log_lib_size[p2 > p1])
} else {
    split <- min(log_lib_size[p1 > p2])
}

## print split
split
```

View the initial result
```{r}
## log the barcode rank
log_barcode_rank <- log10(barcode_rank)

## plot
plot(log_barcode_rank, log_lib_size, xlim=c(1,6))
## add the split as a line on the plot
abline(h=split, col="red")
```

Final Figures:
```{r barcode plot}
## make the results of the above functions into a dataframe
df_barcodes <- as.data.frame(cbind(barcode_rank, log_lib_size, lib_size), row.names = NULL)

## add a column for if it is a cell or not
df_barcodes$cell = rownames(df_barcodes) %in% which(df_barcodes$log_lib_size > split)

## change value to a numeric
df_barcodes$cell <- as.numeric(df_barcodes$cell)

## change the 0 to a 2 for ease of handling
df_barcodes$cell[df_barcodes$cell<1] <- 2

## rename the numerics into cells or background
df_barcodes$cell[df_barcodes$cell == 1] <- "Cells"
df_barcodes$cell[df_barcodes$cell == 2] <- "Background"

## extract the cutoff for cells do you can plot the lines
boundary <- as.numeric(sum(df_barcodes$cell == "Cells"))
split <- 10^split

## make the plot
barcode_plot <- ggplot(df_barcodes, aes(x=barcode_rank, y=lib_size, colour = cell, theme_size = 40)) +
  ## make into a dot plot
  geom_point(size = 1, shape = 16) +
  ## make the axis into log and specify breaks
  scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) +
  ## make the axis into log and specify breaks
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) +
  annotation_logticks() +
  ## change colours of plot
  scale_color_manual(values=c("#bdbdbd", "#5ba43a"), labels = c("Background", "Cells")) +
  ## change aes of legend
  theme(legend.position="bottom", text = element_text(size=25), legend.text=element_text(size=25), axis.text=element_text(size=25)) +
  ## add the lines on the plot
  geom_segment(aes(x = 0, y = split, xend = boundary, yend = split), colour = "black", alpha = 0.01) +
  geom_segment(aes(x = boundary, y = 0, xend = boundary, yend = split), colour = "black", alpha = 0.01) +
  ## change the axis labels
  labs(x = "Barcodes", y = "UMI Counts", colour="Cell Designation") +
  ## change the size of the legend
  guides(colour = guide_legend(override.aes = list(size=10))) +
  ## fix axis
  coord_fixed() +
  ## make it look pretty
  theme_light()

## print the plot
barcode_plot

## save the plot
ggsave("barcode_plot_5k.png", plot = barcode_plot, device = "png", height = 15, width = 15, units = "cm", path = "/Users/Andy/GCSKO/GCSKO_analysis_git/images_to_export/")
```

so the number of cells that is retained is:
```{r, echo = FALSE}
table(df_barcodes$cell)
```

### Filter the object

```{r filter cells vs background}
## extract the nCount and row names from the Seurat object
upb <- data.frame(nCount_RNA = tenx5k@meta.data$nCount_RNA, row.names = rownames(tenx5k@meta.data))

## make blank column for rank
upb$rank <- NA

## order by nCounts
order.scores <- order(upb$nCount_RNA, decreasing = TRUE)

## add a rank to this column based on the ordered nCount
upb$rank[order.scores] <- 1:nrow(upb)

## inspect
#head(upb)

## make a list of cells to retain in the Seurat object
keep_cells <- rownames(upb[which(upb$rank < 7763),])

## subset Seurat object to include cells and discard background
pb_sex <- subset(tenx5k, cells = keep_cells)
```

# 4. Filter Out Poor-Quality Cells {.tabset}

### Filter Mitochondrial %

Mitochondrial cell counts
```{r, fig.height = 15, fig.width = 7}
## extract mitochondrial genes 
#mito_genes <- gtf[which(gtf$V3 == "rRNA"),]$V9
#mito_genes <- gsub(";.*","", gsub("gene_id ", "", mito_genes))
#paste("These are the mitochondrial genes")
#head(mito_genes)

## extract mito genes
mito_genes <- pb_sex@assays$RNA@counts@Dimnames[[1]][grep("^PBANKA-MIT",pb_sex@assays$RNA@counts@Dimnames[[1]])]

## plot the genes individually
VlnPlot(object = pb_sex, features = mito_genes, pt.size = 0.01)

## make a percentage mitocondrial for each cell (this will work as long as you filter cells out with zero counts)
pb_sex <- PercentageFeatureSet(pb_sex, pattern = "^PBANKA-MIT", col.name = "percent.mt")
```

plot percentage mitochondrial
```{r}
## plot for percentage of mitochondrial reads
v1 <- VlnPlot(object = pb_sex, features = "percent.mt", pt.size = 0.01) +
  ## add a line where we will filter
  geom_abline(intercept = 20, col="blue") +
  ## change labels
  labs(x = "",y = "% Mitochondrial Reads", title = "Mitochondrial per cell") +
  ## remove legend
  theme(legend.position = "none") +
  ## change appearance
  theme_classic() +
  scale_fill_manual(values="grey") +
  #scale_y_continuous(limits = c(0, 100)) +
  theme(legend.position="none", axis.text.x = element_blank(), axis.ticks.x=element_blank(), text = element_text(size=20), legend.text=element_text(size=20), axis.text=element_text(size=20), axis.text.y=element_text(colour="black"))

## print
v1

## save
#ggsave("~/images_to_export/QC_10X_mito_violin.png", plot = QC_mito_violin, device = "png", path = NULL, scale = 1, width = 15, height = 10, units = "cm", dpi = 300, limitsize = TRUE)
```

In the Smart-seq2 data, we use a threshold of 20%. No cell in our 10X data is higher than 13% and only
```{r, echo = FALSE}
sum(pb_sex@meta.data$percent.mt > 5)
```
cells have a % mitochondrial reads above 5%. 

### nGenes filter

plot individual violin plots
```{r}
## nGenes plot
gene_plot_5k <- VlnPlot(object = pb_sex, features = "nFeature_RNA", pt.size = 0.01)

## improve the aesthetics
gene_plot_5k <- gene_plot_5k + 
  geom_abline(intercept = 200, col="blue") +
  labs(x = "",y = "nGene", title = "Genes per cell") +
  theme_classic() +
  scale_fill_manual(values="grey") +
  scale_y_continuous(limits = c(0, 3000)) +
  theme(legend.position="none", axis.text.x = element_blank(), axis.ticks.x=element_blank(), text = element_text(size=20), legend.text=element_text(size=20), axis.text=element_text(size=20), axis.text.y=element_text(colour="black"))

## nUMI plot
numi_plot_5k <- VlnPlot(object = pb_sex, features = "nCount_RNA", pt.size = 0.01)

## improve aesthetics
numi_plot_5k <- numi_plot_5k +
  labs(x = "",y = "nUMI", title = "UMIs per cell") +
  theme_classic() +
  scale_fill_manual(values="grey") +
  scale_y_continuous(limits = c(0, 3000)) +
  theme(legend.position="none", axis.text.x = element_blank(), axis.ticks.x=element_blank(), text = element_text(size=20), legend.text=element_text(size=20), axis.text=element_text(size=20), axis.text.y=element_text(colour="black"))

## plot together
grid.arrange(gene_plot_5k, numi_plot_5k, ncol = 2, top=textGrob("5K cells 10X", gp=gpar(fontsize=15,font=8)))

## save nGene plot on its own
#ggsave("ngene_plot.pdf", plot = gene1, device = "pdf", height = 5, width = 5, units = "in", path = "/Users/ar19/Desktop/PhD/GCSKO_Analysis")
```

plot two metrics together
```{r, fig.height = 8, fig.width = 8}
## make a dataframe for important filtering metrics
df <- data.frame(nCount = log10(pb_sex@meta.data$nCount_RNA), nGenes = pb_sex@meta.data$nFeature_RNA, percent_mt = pb_sex@meta.data$percent.mt, experiment = pb_sex@meta.data$experiment)

## plot main dotplot
plot1 <- ggplot(df, aes(x = nCount, y = nGenes)) + 
  geom_point(aes(), size = 0.1) +
  geom_rug() + 
  scale_y_continuous(name = "Number of Detected Genes") + 
  scale_x_continuous(name = "log10(Number of Total UMI)") + 
  theme_pubr() +
  theme(legend.position = "bottom") +
  geom_hline(yintercept=200)

## plot density plot x
dens1 <- ggplot(df, aes(x = nCount)) + 
  geom_density(alpha = 0.2) + 
  theme_void() + 
  theme(legend.position = "none")

## plot density plot y
## old method
# dens2 <- ggplot(df, aes(x = nGenes, y = experiment)) +
#   geom_density(alpha = 0.2) +
#   theme_void() +
#   theme(legend.position = "none") +
#   coord_flip()
## new method
dens <- density(df$nGenes)
dt <- data.frame(x=dens$x, y=dens$y)
probs <- c(0, 0.25, 0.5, 0.75, 1)
quantiles <- quantile(df$nGenes, prob=probs)
dt$quant <- factor(findInterval(dt$x,quantiles))
dens2 <- ggplot(dt, aes(x,y)) + geom_line() + geom_ribbon(aes(ymin=0, ymax=y, fill=quant)) + scale_x_continuous(breaks=quantiles) + scale_fill_brewer(guide="none") + theme_void() + theme(legend.position = "none") +
  coord_flip()

## plot together
QC_composite_plot <- dens1 + plot_spacer() + plot1 + dens2 + plot_layout(ncol = 2, nrow = 2, widths = c(4, 1), heights = c(1, 4))

## print
QC_composite_plot

## save
ggsave("/Users/Andy/GCSKO/GCSKO_analysis_git/images_to_export/QC_10X_composite_plot.png", plot = QC_composite_plot, device = "png", path = NULL, scale = 1, width = 20, height = 20, units = "cm", dpi = 300, limitsize = TRUE)
```

### Filtering

The threshold used in the malaria cell atlas was 230 for Pb but this is dependent on sequencing depth etc.
We can plot the number of cells recovered for a range of thresholds:
```{r}
paste("original number of cells =", nrow(pb_sex@meta.data))
paste("with >150 filter =", nrow(pb_sex@meta.data[pb_sex@meta.data$nFeature_RNA > 150, ]))
paste("with >200 filter =", nrow(pb_sex@meta.data[pb_sex@meta.data$nFeature_RNA > 200, ]))
paste("with >230 filter =", nrow(pb_sex@meta.data[pb_sex@meta.data$nFeature_RNA > 230, ]))
```

Since we have already filtered on nUMI, we will filter with 200.

```{r}
## number of cells before filtering
pb_sex_pre_filter_nCells <- nrow(pb_sex@meta.data)
## filter object
pb_sex <- subset(pb_sex, subset = nFeature_RNA > 200)
## number of cells after filtering
pb_sex_post_filter_nCells <- nrow(pb_sex@meta.data)
## print results of filtering
paste("number of cells pre-filter", pb_sex_pre_filter_nCells)
paste("number of cells post-filter", pb_sex_post_filter_nCells)
paste((pb_sex_pre_filter_nCells - pb_sex_post_filter_nCells), "cells were removed by filtering on number of genes.")
```

# 5. Dimensionality Reduction and Clustering {.tabset}

### Prepare data
```{r}
## normalise object
pb_sex <- NormalizeData(pb_sex, normalization.method = "LogNormalize", scale.factor = 10000)

## find variable genes
pb_sex <- FindVariableFeatures(pb_sex, selection.method = "vst", nfeatures = 2000)

## scale the data
all.genes <- rownames(pb_sex)
pb_sex <- ScaleData(pb_sex, features = all.genes)
```

### PCA
```{r}
## run PCA
pb_sex <- RunPCA(pb_sex, features = VariableFeatures(object = pb_sex))

## plot 
DimPlot(pb_sex, reduction = "pca")

## elbow plot
ElbowPlot(pb_sex, ndims = 30, reduction = "pca")
```

### UMAP
```{r}
## run UMAP
pb_sex <- RunUMAP(pb_sex, dims = 1:10, seed.use = 1234, n.neighbors = 150)

## plot
DimPlot(pb_sex, reduction = "umap", group.by = "ident", label = TRUE)

## These are the parameters used in the merge UMAP
#pb_sex <- RunUMAP(pb_sex, reduction = "pca", dims = 1:10, n.neighbors = 150, seed.use = 1234, min.dist = 0.4, repulsion.strength = 0.03, local.connectivity = 150)
#DimPlot(pb_sex, reduction = "umap", group.by = "ident", label = TRUE)
```

colour with a few marker genes:
```{r, fig.height = 6, fig.width = 8}
# PBANKA-0515000 - p25 - female
# PBANKA-1212600 - HAP2 - male
# PBANKA-0600600 - NEK3 - male
# PBANKA-0831000 - MSP1 - late asexual
# PBANKA-1315700 - RON2 - (asexuals and some male?)
# PBANKA-0416100 - dynenin heavy chain - male - used in 820 line
# PBANKA-1319500 - CCP2 - female - used in 820 line 
# PBANKA-1437500 - AP2-G - seuxal commitment gene
# PBANKA-1102200 - MSP8 - early asexual (from Bozdech paper)

FeaturePlot(pb_sex, features = c("PBANKA-0515000", "PBANKA-1212600","PBANKA-0600600", "PBANKA-0831000", "PBANKA-1315700", "PBANKA-0416100", "PBANKA-1319500", "PBANKA-1437500", "PBANKA-1102200"))
```

### Clustering
```{r}
pb_sex <- FindNeighbors(pb_sex, dims = 1:21)
pb_sex <- FindClusters(pb_sex, resolution = 1)
```

# 6. Remove Doublets {.tabset}

### DoubletFinder

DoubletFinder function
```{r}
## DoubletFinder should be able to be installed and run as so:
#devtools::install_github('chris-mcginnis-ucsf/DoubletFinder')
#library(doubletFinder) #allows removal of doublets
## but there seems to be some problems with this so we will run it from the github code

## the doublet finder function
doubletFinder_v3 <- function(seu, PCs, pN = 0.25, pK, nExp, reuse.pANN = FALSE, sct = FALSE) {
  require(Seurat); require(fields); require(KernSmooth)

  ## Generate new list of doublet classificatons from existing pANN vector to save time
  if (reuse.pANN != FALSE ) {
    pANN.old <- seu@meta.data[ , reuse.pANN]
    classifications <- rep("Singlet", length(pANN.old))
    classifications[order(pANN.old, decreasing=TRUE)[1:nExp]] <- "Doublet"
    seu@meta.data[, paste("DF.classifications",pN,pK,nExp,sep="_")] <- classifications
    return(seu)
  }

  if (reuse.pANN == FALSE) {
    ## Make merged real-artifical data
    real.cells <- rownames(seu@meta.data)
    data <- seu@assays$RNA@counts[, real.cells]
    n_real.cells <- length(real.cells)
    n_doublets <- round(n_real.cells/(1 - pN) - n_real.cells)
    print(paste("Creating",n_doublets,"artificial doublets...",sep=" "))
    real.cells1 <- sample(real.cells, n_doublets, replace = TRUE)
    real.cells2 <- sample(real.cells, n_doublets, replace = TRUE)
    doublets <- (data[, real.cells1] + data[, real.cells2])/2
    colnames(doublets) <- paste("X", 1:n_doublets, sep = "")
    data_wdoublets <- cbind(data, doublets)

    ## Store important pre-processing information
    orig.commands <- seu@commands

    ## Pre-process Seurat object
    if (sct == FALSE) {
      print("Creating Seurat object...")
      seu_wdoublets <- CreateSeuratObject(counts = data_wdoublets)

      print("Normalizing Seurat object...")
      seu_wdoublets <- NormalizeData(seu_wdoublets,
                                     normalization.method = orig.commands$NormalizeData.RNA@params$normalization.method,
                                     scale.factor = orig.commands$NormalizeData.RNA@params$scale.factor,
                                     margin = orig.commands$NormalizeData.RNA@params$margin)

      print("Finding variable genes...")
      seu_wdoublets <- FindVariableFeatures(seu_wdoublets,
                                            selection.method = orig.commands$FindVariableFeatures.RNA$selection.method,
                                            loess.span = orig.commands$FindVariableFeatures.RNA$loess.span,
                                            clip.max = orig.commands$FindVariableFeatures.RNA$clip.max,
                                            mean.function = orig.commands$FindVariableFeatures.RNA$mean.function,
                                            dispersion.function = orig.commands$FindVariableFeatures.RNA$dispersion.function,
                                            num.bin = orig.commands$FindVariableFeatures.RNA$num.bin,
                                            binning.method = orig.commands$FindVariableFeatures.RNA$binning.method,
                                            nfeatures = orig.commands$FindVariableFeatures.RNA$nfeatures,
                                            mean.cutoff = orig.commands$FindVariableFeatures.RNA$mean.cutoff,
                                            dispersion.cutoff = orig.commands$FindVariableFeatures.RNA$dispersion.cutoff)

      print("Scaling data...")
      seu_wdoublets <- ScaleData(seu_wdoublets,
                                 features = orig.commands$ScaleData.RNA$features,
                                 model.use = orig.commands$ScaleData.RNA$model.use,
                                 do.scale = orig.commands$ScaleData.RNA$do.scale,
                                 do.center = orig.commands$ScaleData.RNA$do.center,
                                 scale.max = orig.commands$ScaleData.RNA$scale.max,
                                 block.size = orig.commands$ScaleData.RNA$block.size,
                                 min.cells.to.block = orig.commands$ScaleData.RNA$min.cells.to.block)

      print("Running PCA...")
      seu_wdoublets <- RunPCA(seu_wdoublets,
                              features = orig.commands$ScaleData.RNA$features,
                              npcs = length(PCs),
                              rev.pca =  orig.commands$RunPCA.RNA$rev.pca,
                              weight.by.var = orig.commands$RunPCA.RNA$weight.by.var,
                              verbose=FALSE)
      pca.coord <- seu_wdoublets@reductions$pca@cell.embeddings[ , PCs]
      cell.names <- rownames(seu_wdoublets@meta.data)
      nCells <- length(cell.names)
      rm(seu_wdoublets); gc() # Free up memory
    }

    if (sct == TRUE) {
      require(sctransform)
      print("Creating Seurat object...")
      seu_wdoublets <- CreateSeuratObject(counts = data_wdoublets)

      print("Running SCTransform...")
      seu_wdoublets <- SCTransform(seu_wdoublets)

      print("Running PCA...")
      seu_wdoublets <- RunPCA(seu_wdoublets, npcs = length(PCs))
      pca.coord <- seu_wdoublets@reductions$pca@cell.embeddings[ , PCs]
      cell.names <- rownames(seu_wdoublets@meta.data)
      nCells <- length(cell.names)
      rm(seu_wdoublets); gc()
    }

    ## Compute PC distance matrix
    print("Calculating PC distance matrix...")
    dist.mat <- fields::rdist(pca.coord)

    ## Compute pANN
    print("Computing pANN...")
    pANN <- as.data.frame(matrix(0L, nrow = n_real.cells, ncol = 1))
    rownames(pANN) <- real.cells
    colnames(pANN) <- "pANN"
    k <- round(nCells * pK)
    for (i in 1:n_real.cells) {
      neighbors <- order(dist.mat[, i])
      neighbors <- neighbors[2:(k + 1)]
      neighbor.names <- rownames(dist.mat)[neighbors]
      pANN$pANN[i] <- length(which(neighbors > n_real.cells))/k
    }

    print("Classifying doublets..")
    classifications <- rep("Singlet",n_real.cells)
    classifications[order(pANN$pANN[1:n_real.cells], decreasing=TRUE)[1:nExp]] <- "Doublet"
    seu@meta.data[, paste("pANN",pN,pK,nExp,sep="_")] <- pANN[rownames(seu@meta.data), 1]
    seu@meta.data[, paste("DF.classifications",pN,pK,nExp,sep="_")] <- classifications
    return(seu)
  }
}
## usage: https://rdrr.io/github/chris-mcginnis-ucsf/DoubletFinder/man/doubletFinder_v3.html
## source: https://github.com/chris-mcginnis-ucsf/DoubletFinder/blob/master/R/doubletFinder_v3.R
```

Run DoubletFinder
```{r}
# the tutorial recommends using this as an approximation:
#nExp_poi <- round(0.15*nrow(pb_sex@meta.data))
#but a more appropriate approximation is that the expected number of doublets is ~1% per 1000 cells so:
nExp_poi <- round((0.01*(nrow(pb_sex@meta.data)/1000))*nrow(pb_sex@meta.data))
#run doublet finder:
pb_sex <- doubletFinder_v3(pb_sex, PCs = 1:21, pN = 0.25, pK = 0.01, nExp = nExp_poi, reuse.pANN = FALSE, sct = FALSE)
```

results in:
```{r}
table(pb_sex@meta.data$DF.classifications_0.25_0.01_440)
```

### Validation and visualisation of doublets

visualise where doublets are:
```{r}
doublet.cells <- c(rownames(pb_sex@meta.data[pb_sex@meta.data$DF.classifications_0.25_0.01_440 == "Doublet",]))
d1 <- DimPlot(pb_sex, reduction = "umap", cells.highlight = doublet.cells, sizes.highlight = 2)
#TSNEPlot(object = pb_sex, cells.highlight = doublet.cells, do.return = TRUE, )
doublet1 <- d1 + coord_fixed() + theme(axis.text.x=element_blank())

## plot clusters
cluster_plot <- DimPlot(pb_sex, reduction = "umap", group.by = "ident", label = TRUE)

doublet1 + cluster_plot
```

```{r}
VlnPlot(object = pb_sex, features = "pANN_0.25_0.01_440", pt.size = 0.1)
```

```{r}
## extract meta data cols of interest
df <- pb_sex@meta.data[,c("RNA_snn_res.1","DF.classifications_0.25_0.01_440")]

## make a table from doublets
df <- data.frame(rbind(table(df)))

## ammend rownames
df$cluster <- rownames(df)

## calculate percentages of cells that are doublets
df$pc_doublet <- ((df[,1])/((df[,1]) + df[,2]))*100

## inspect
#kable(df[order(df$pc_doublet),])

## plot 
ggplot(data=df, aes(x=cluster, y=pc_doublet)) +
  geom_col(fill="steelblue") +
  theme_minimal()
```

### Filter doublets

It definitely seems like there are some biases in doublet detection. Fewer doublets in very early rings and in mature sexes may be due to a smaller number of the population being these cells. 

It may also be biological, that these cells are less likely to associate to one another (although less likely, as doublets are a result of the probability of two cells being captured inside the same droplet together at a certain loading concentration, rather than two cells already being together upon droplet capture).

remove doublets:
```{r}
## make a list of singlet cells
keep_singlets <- rownames(pb_sex@meta.data[pb_sex@meta.data$DF.classifications_0.25_0.01_440 == "Singlet",])

## subset into new seurat object
pb_sex_filtered <- subset(pb_sex, cells = keep_singlets, subset.raw = TRUE)

## compare old and new objects
pb_sex
pb_sex_filtered
```

# 7. Life Cycle Stage (Using Bulk RNA-Seq Correlation) {.tabset}

Add in bulk data predictions

### hoo et al.
```{r}
#Pb Prediction correlations with bulk data (asexual hoo): 

#Load in required package:
library(Hmisc)
#Cooerce expression data into a matrix and load in the reference timecourse data:
x10 <- as.matrix(pb_sex_filtered@assays$RNA@data)
rownames(x10) <- gsub("-", "_", rownames(x10))
#read in bulk data:
hoo<-as.matrix(read.table("/Users/Andy/GCSKO/GCSKO_analysis_git/data/Reference/hoo_berg2.txt",header=T, row.names=1))
#Make a blank dataframe in which to add prediction:
df <- data.frame(matrix(ncol = 4, nrow = 0))
colnames(df) <- c("Prediction(Spearman)","r(Spearman)","Prediction(Pearsons)","r(Pearsons)")
#Do correlations with bulk data using both Spearman and Pearson (and the top 1000 genes):
for (i in 1:ncol(x10))
{
  shared<-intersect(row.names(as.matrix(head(sort(x10[,i], decreasing=TRUE),1000))),row.names(hoo))
  step0<-rcorr(x10[shared,i],hoo[shared,1:12],type = "spearman")
  step1<-as.matrix(t(step0$r[2:13,1]))
  step2<-rcorr(x10[shared,i],hoo[shared,1:12],type = "pearson")
  step3<-as.matrix(t(step2$r[2:13,1]))
  step4<-cbind(colnames(step1)[which.max(step1)],step1[which.max(step1)],colnames(step3)[which.max(step3)],step3[which.max(step3)])
  colnames(step4) <- c("Prediction(Spearman)","r(Spearman)","Prediction(Pearsons)","r(Pearsons)")
  rownames(step4)<-colnames(x10)[i]
  df<-rbind(df,step4)
}
#Write out data into a csv file:
#write.csv(dfringr,file="/Users/ar19/Desktop/PhD/AR04_GCSKO_project/All_mutants_Feb_2018/predictionpbcombined.csv")
#Change the format of the output to make it more readable:
#gsub("Pb_","", dfringr[,1]) - Make predictions into 18hr.dat format:

#spearman:
df[,1] <- gsub("Pb_","", df[,1])
#Remove hr.dat from list:
df[,1] <- gsub("hr.dat","", df[,1])
#Check - dfringr[,1]
#Make into a number:
df[,1] <- as.numeric(df[,1])
df[,2] <- as.numeric(as.character(df[,2]))

#pearson:
df[,3] <- gsub("Pb_","", df[,3])
#Remove hr.dat from list:
df[,3] <- gsub("hr.dat","", df[,3])
#Check - dfringr[,1]
#Make into a number:
df[,3] <- as.numeric(df[,3])
df[,4] <- as.numeric(as.character(df[,4]))
#add to 10X object:
pb_sex_filtered <- AddMetaData(pb_sex_filtered, metadata = df)
```

### Kasia's data
Can also do with Kasia's timecourse data:
```{r}
kas<-as.matrix(read.table("/Users/Andy/GCSKO/GCSKO_analysis_git/data/Reference/AP2OETC.txt",header=T, row.names=1))
#Make a blank dataframe in which to add prediction:
dfs <- data.frame(matrix(ncol = 4, nrow = 0))
colnames(dfs) <- c("ID","Prediction","r (Pearson)")
#Do correlations with bulk data using both Spearman and Pearson (and the top 1000 genes):
for (i in 1:ncol(x10))
{
  shared<-intersect(row.names(as.matrix(head(sort(x10[,i], decreasing=TRUE),1000))),rownames(kas))
  step0<-rcorr(x10[shared,i],kas[shared,1:10],type = "spearman")
  step1<-as.matrix(t(step0$r[2:11,1]))
  step2<-rcorr(x10[shared,i],kas[shared,1:10],type = "pearson")
  step3<-as.matrix(t(step2$r[2:11,1]))
  step4<-cbind(colnames(step1)[which.max(step1)],step1[which.max(step1)],colnames(step3)[which.max(step3)],step3[which.max(step3)])
  colnames(step4) <- c("Prediction(Spearman)","r(Spearman)","Prediction(Pearsons)","r(Pearsons)")
  rownames(step4)<-colnames(x10)[i]
  dfs<-rbind(dfs,step4)
}
#Write out data into a csv file:
#write.csv(df,file="/Users/ar19/Desktop/PhD/AR04_GCSKO_project/All_mutants_Feb_2018/predictionkasiacombined.csv")

#Change the format of the output to make it more readable:
#gsub("Pb_","", dfs[,1]) - Make predictions into 18hr.dat format:
dfs[,1] <- gsub("X","", dfs[,1])
#Make into a number:
dfs[,1] <- as.numeric(dfs[,1])
#Make into a number:
dfs[,2] <- as.numeric(as.character(dfs[,2]))

#gsub("Pb_","", dfs[,1]) - Make predictions into 18hr.dat format:
dfs[,3] <- gsub("X","", dfs[,3])
#Make into a number:
dfs[,3] <- as.numeric(dfs[,3])
#dfs[,1]
#Make into a number:
dfs[,4] <- as.numeric(as.character(dfs[,4]))

colnames(dfs) <- c('Prediction(Spearman)_Kasia', 'r(Spearman)_Kasia', 'Prediction(Pearson)_Kasia', 'r(Pearson)_Kasia')
#add to Seurat:
#add to 10X object:
pb_sex_filtered <- AddMetaData(pb_sex_filtered, dfs)
```

### Visualise

Confirm life cycle designations:
```{r}
## plot
FeaturePlot(pb_sex_filtered, features = c("Prediction.Spearman._Kasia", "Prediction.Spearman."))
```

optimse UMAP
```{r}
## PCA calc
pb_sex_filtered <- RunPCA(pb_sex_filtered, features = VariableFeatures(object = pb_sex_filtered))

## elbow plot
ElbowPlot(pb_sex_filtered, ndims = 30, reduction = "pca")

## UMAP calc
pb_sex_filtered <- RunUMAP(pb_sex_filtered, dims = 1:8, seed.use = 300, n.neighbors = 60, min.dist = 0.5, repulsion.strength = 0.05, local.connectivity = 20)

## UMAP plot
DimPlot(pb_sex_filtered, reduction = "umap", group.by = "ident", label = TRUE)
```


# 8. Save and Export {.tabset}

Save environment
```{r}
## This saves everything in the global environment for easy recall later
#save.image(file = "GCSKO_10X_QC.RData")
#load(file = "GCSKO_10X_QC.RData")
```

Save object(s)
```{r}
## save Rdata
#save(pb_30k_sex_filtered, pb_sex_filtered, file = "Part_2_input.Rdata")
#load(file = "Part_2_input.Rdata")

## save RDS
saveRDS(pb_sex_filtered, file = "/Users/Andy/GCSKO/GCSKO_analysis_git/data_to_export/pb_sex_filtered.RDS", compress = FALSE) 
#pb_sex_filtered <- readRDS("pb_sex_filtered.RDS")

## Save Robj
#save(pb_sex_filtered,file="pb_sex_filtered.Robj")
```

# Appendix {.tabset}

## Session Info 
```{r, echo = FALSE}
sessionInfo()
```

